Conversation

AZero13 (Contributor) commented Oct 9, 2025

No description provided.

AZero13 closed this Oct 9, 2025
AZero13 reopened this Oct 9, 2025
llvmbot (Member) commented Oct 9, 2025

@llvm/pr-subscribers-backend-amdgpu

Author: AZero13 (AZero13)

Changes

Full diff: https://github.com/llvm/llvm-project/pull/162732.diff

3 Files Affected:

  • (modified) llvm/include/llvm/CodeGen/TargetLowering.h (+5-2)
  • (modified) llvm/test/CodeGen/AMDGPU/extract-lowbits.ll (+29-18)
  • (modified) llvm/test/CodeGen/AMDGPU/r600.extract-lowbits.ll (+6-8)
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 88691b931a8d8..9865143edbdfb 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -839,8 +839,11 @@ class LLVM_ABI TargetLoweringBase {
   /// Return true if the variant with 2 variable shifts is preferred.
   /// Return false if there is no preference.
   virtual bool shouldFoldMaskToVariableShiftPair(SDValue X) const {
-    // By default, let's assume that no one prefers shifts.
-    return false;
+    // By default, prefer the shift pair only when the shift is legal for
+    // the type; illegal types (e.g. vectors) keep the mask form.
+    EVT VT = X.getValueType();
+
+    return isOperationLegal(ISD::SHL, VT);
   }
 
   /// Return true if it is profitable to fold a pair of shifts into a mask.
diff --git a/llvm/test/CodeGen/AMDGPU/extract-lowbits.ll b/llvm/test/CodeGen/AMDGPU/extract-lowbits.ll
index 89bd5f1dc7969..59bc1dd019de3 100644
--- a/llvm/test/CodeGen/AMDGPU/extract-lowbits.ll
+++ b/llvm/test/CodeGen/AMDGPU/extract-lowbits.ll
@@ -103,16 +103,16 @@ define i32 @bzhi32_c0(i32 %val, i32 %numlowbits) nounwind {
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_sub_i32_e32 v1, vcc, 32, v1
-; SI-NEXT:    v_lshr_b32_e32 v1, -1, v1
-; SI-NEXT:    v_and_b32_e32 v0, v1, v0
+; SI-NEXT:    v_lshlrev_b32_e32 v0, v1, v0
+; SI-NEXT:    v_lshrrev_b32_e32 v0, v1, v0
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: bzhi32_c0:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_sub_u32_e32 v1, vcc, 32, v1
-; VI-NEXT:    v_lshrrev_b32_e64 v1, v1, -1
-; VI-NEXT:    v_and_b32_e32 v0, v1, v0
+; VI-NEXT:    v_lshlrev_b32_e32 v0, v1, v0
+; VI-NEXT:    v_lshrrev_b32_e32 v0, v1, v0
 ; VI-NEXT:    s_setpc_b64 s[30:31]
   %numhighbits = sub i32 32, %numlowbits
   %mask = lshr i32 -1, %numhighbits
@@ -121,12 +121,23 @@ define i32 @bzhi32_c0(i32 %val, i32 %numlowbits) nounwind {
 }
 
 define i32 @bzhi32_c0_clamp(i32 %val, i32 %numlowbits) nounwind {
-; GCN-LABEL: bzhi32_c0_clamp:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_and_b32_e32 v1, 31, v1
-; GCN-NEXT:    v_bfe_u32 v0, v0, 0, v1
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; SI-LABEL: bzhi32_c0_clamp:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    v_and_b32_e32 v1, 31, v1
+; SI-NEXT:    v_sub_i32_e32 v1, vcc, 32, v1
+; SI-NEXT:    v_lshlrev_b32_e32 v0, v1, v0
+; SI-NEXT:    v_lshrrev_b32_e32 v0, v1, v0
+; SI-NEXT:    s_setpc_b64 s[30:31]
+;
+; VI-LABEL: bzhi32_c0_clamp:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    v_and_b32_e32 v1, 31, v1
+; VI-NEXT:    v_sub_u32_e32 v1, vcc, 32, v1
+; VI-NEXT:    v_lshlrev_b32_e32 v0, v1, v0
+; VI-NEXT:    v_lshrrev_b32_e32 v0, v1, v0
+; VI-NEXT:    s_setpc_b64 s[30:31]
   %low5bits = and i32 %numlowbits, 31
   %numhighbits = sub i32 32, %low5bits
   %mask = lshr i32 -1, %numhighbits
@@ -139,16 +150,16 @@ define i32 @bzhi32_c1_indexzext(i32 %val, i8 %numlowbits) nounwind {
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_sub_i32_e32 v1, vcc, 32, v1
-; SI-NEXT:    v_lshr_b32_e32 v1, -1, v1
-; SI-NEXT:    v_and_b32_e32 v0, v1, v0
+; SI-NEXT:    v_lshlrev_b32_e32 v0, v1, v0
+; SI-NEXT:    v_lshrrev_b32_e32 v0, v1, v0
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: bzhi32_c1_indexzext:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_sub_u16_e32 v1, 32, v1
-; VI-NEXT:    v_lshrrev_b32_e64 v1, v1, -1
-; VI-NEXT:    v_and_b32_e32 v0, v1, v0
+; VI-NEXT:    v_lshlrev_b32_e32 v0, v1, v0
+; VI-NEXT:    v_lshrrev_b32_e32 v0, v1, v0
 ; VI-NEXT:    s_setpc_b64 s[30:31]
   %numhighbits = sub i8 32, %numlowbits
   %sh_prom = zext i8 %numhighbits to i32
@@ -162,16 +173,16 @@ define i32 @bzhi32_c4_commutative(i32 %val, i32 %numlowbits) nounwind {
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_sub_i32_e32 v1, vcc, 32, v1
-; SI-NEXT:    v_lshr_b32_e32 v1, -1, v1
-; SI-NEXT:    v_and_b32_e32 v0, v0, v1
+; SI-NEXT:    v_lshlrev_b32_e32 v0, v1, v0
+; SI-NEXT:    v_lshrrev_b32_e32 v0, v1, v0
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: bzhi32_c4_commutative:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_sub_u32_e32 v1, vcc, 32, v1
-; VI-NEXT:    v_lshrrev_b32_e64 v1, v1, -1
-; VI-NEXT:    v_and_b32_e32 v0, v0, v1
+; VI-NEXT:    v_lshlrev_b32_e32 v0, v1, v0
+; VI-NEXT:    v_lshrrev_b32_e32 v0, v1, v0
 ; VI-NEXT:    s_setpc_b64 s[30:31]
   %numhighbits = sub i32 32, %numlowbits
   %mask = lshr i32 -1, %numhighbits
diff --git a/llvm/test/CodeGen/AMDGPU/r600.extract-lowbits.ll b/llvm/test/CodeGen/AMDGPU/r600.extract-lowbits.ll
index 5b21a3652d11c..1055fc101a6c5 100644
--- a/llvm/test/CodeGen/AMDGPU/r600.extract-lowbits.ll
+++ b/llvm/test/CodeGen/AMDGPU/r600.extract-lowbits.ll
@@ -266,7 +266,7 @@ define amdgpu_kernel void @bzhi32_c1_indexzext(i32 %val, i8 %numlowbits, ptr add
 ; EG:       ; %bb.0:
 ; EG-NEXT:    ALU 0, @8, KC0[], KC1[]
 ; EG-NEXT:    TEX 0 @6
-; EG-NEXT:    ALU 8, @9, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    ALU 7, @9, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
 ; EG-NEXT:    CF_END
 ; EG-NEXT:    PAD
@@ -279,9 +279,8 @@ define amdgpu_kernel void @bzhi32_c1_indexzext(i32 %val, i8 %numlowbits, ptr add
 ; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
 ; EG-NEXT:     AND_INT * T0.W, PV.W, literal.x,
 ; EG-NEXT:    255(3.573311e-43), 0(0.000000e+00)
-; EG-NEXT:     LSHR * T0.W, literal.x, PV.W,
-; EG-NEXT:    -1(nan), 0(0.000000e+00)
-; EG-NEXT:     AND_INT T0.X, PV.W, KC0[2].Y,
+; EG-NEXT:     LSHL * T1.W, KC0[2].Y, PV.W,
+; EG-NEXT:     LSHR T0.X, PV.W, T0.W,
 ; EG-NEXT:     LSHR * T1.X, KC0[2].W, literal.x,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
 ;
@@ -289,7 +288,7 @@ define amdgpu_kernel void @bzhi32_c1_indexzext(i32 %val, i8 %numlowbits, ptr add
 ; CM:       ; %bb.0:
 ; CM-NEXT:    ALU 0, @8, KC0[], KC1[]
 ; CM-NEXT:    TEX 0 @6
-; CM-NEXT:    ALU 8, @9, KC0[CB0:0-32], KC1[]
+; CM-NEXT:    ALU 7, @9, KC0[CB0:0-32], KC1[]
 ; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X
 ; CM-NEXT:    CF_END
 ; CM-NEXT:    PAD
@@ -302,9 +301,8 @@ define amdgpu_kernel void @bzhi32_c1_indexzext(i32 %val, i8 %numlowbits, ptr add
 ; CM-NEXT:    32(4.484155e-44), 0(0.000000e+00)
 ; CM-NEXT:     AND_INT * T0.W, PV.W, literal.x,
 ; CM-NEXT:    255(3.573311e-43), 0(0.000000e+00)
-; CM-NEXT:     LSHR * T0.W, literal.x, PV.W,
-; CM-NEXT:    -1(nan), 0(0.000000e+00)
-; CM-NEXT:     AND_INT * T0.X, PV.W, KC0[2].Y,
+; CM-NEXT:     LSHL * T1.W, KC0[2].Y, PV.W,
+; CM-NEXT:     LSHR * T0.X, PV.W, T0.W,
 ; CM-NEXT:     LSHR * T1.X, KC0[2].W, literal.x,
 ; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
   %numhighbits = sub i8 32, %numlowbits
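
For readers skimming the diff: with the new default, targets that do not override the hook opt into the mask-to-variable-shift-pair fold whenever SHL is legal for the value type, instead of materializing the all-ones mask. A minimal IR-level sketch, modeled on the bzhi32_c0 test above (the trailing and/ret lines are reconstructed for illustration):

; Illustrative only, modeled on bzhi32_c0.
define i32 @low_bits(i32 %val, i32 %numlowbits) nounwind {
  %numhighbits = sub i32 32, %numlowbits
  %mask = lshr i32 -1, %numhighbits   ; mask of %numlowbits low ones
  %masked = and i32 %mask, %val       ; mask form: %mask needs its own register
  ; with the new hook default this instead selects to the shift pair:
  ;   (%val << %numhighbits) >> %numhighbits
  ret i32 %masked
}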

Comment on lines +126 to +130
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 31, v1
; SI-NEXT: v_sub_i32_e32 v1, vcc, 32, v1
; SI-NEXT: v_lshlrev_b32_e32 v0, v1, v0
; SI-NEXT: v_lshrrev_b32_e32 v0, v1, v0

Regression, broke a BFE pattern?

Comment on lines +106 to +107
; SI-NEXT: v_lshlrev_b32_e32 v0, v1, v0
; SI-NEXT: v_lshrrev_b32_e32 v0, v1, v0

This is a neutral change at best

jayfoad (Contributor) commented Oct 10, 2025

Why? I don't see any tests that show a benefit. I guess it could conceivably reduce register pressure, since the mask has to be materialized in a register, but that would only help if the shift amount has more uses, and there aren't any test cases for that.
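
To make that concrete, here is a hypothetical case (deliberately not one of the PR's tests) where the shift amount has a second use, so the shift-pair form could avoid keeping the materialized mask live:

; Hypothetical example, not part of this PR's test changes.
define i32 @bzhi32_extra_use(i32 %val, i32 %numlowbits, ptr %p) nounwind {
  %numhighbits = sub i32 32, %numlowbits
  store i32 %numhighbits, ptr %p        ; extra use keeps %numhighbits live anyway
  %mask = lshr i32 -1, %numhighbits     ; mask form adds another live value (%mask)
  %masked = and i32 %mask, %val         ; shift-pair form would reuse %numhighbits instead
  ret i32 %masked
}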

AZero13 closed this Oct 14, 2025
AZero13 deleted the shifts branch October 14, 2025 20:15